import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler,LabelEncoder
import numpy as np
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV
# %matplotlib inline
# Load the customer dataset and take a first look at its structure.
data = pd.read_csv('Data for Associate DS.csv')
data.head()
data.info()
data.describe()

# Count missing values and distinct values per column.
data.isnull().sum()
data.nunique()

# Impute the missing values with the column median.
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# column selected from the frame is chained assignment, which pandas warns
# about and which leaves the frame untouched under copy-on-write.
for column in ('CREDIT_LIMIT', 'MINIMUM_PAYMENTS'):
    data[column] = data[column].fillna(data[column].median())
data.isna().sum()
# The average monthly purchase is the total purchases divided by the tenure.
data['Monthly_avg_purchase'] = data['PURCHASES'] / data['TENURE']

# Number of customers for each distinct tenure value.
print(data.groupby('TENURE')['TENURE'].count())

# The average monthly cash advance is the total cash advance divided by the tenure.
data['Monthly_cash_advance'] = data['CASH_ADVANCE'] / data['TENURE']
data.loc[:, ['ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES']]

# One-off purchases against installment purchases, one marker per customer.
plt.figure(figsize=(8, 7))
sns.set_style('darkgrid')
ax = sns.scatterplot(
    data=data,
    x='INSTALLMENTS_PURCHASES',
    y='ONEOFF_PURCHASES',
    marker='.',
    # Marker size must be a scalar (or one value per point); the previous
    # two-element tuple matched neither.
    s=200,
)
plt.show()
def purchase_type(data):
    """Classify one customer record by the kinds of purchases it contains.

    Args:
        data: Mapping-like record (for example a DataFrame row) exposing
            'ONEOFF_PURCHASES' and 'INSTALLMENTS_PURCHASES' entries.

    Returns:
        'No Purchase', 'One Off Purchase', 'Installment ' or 'Both'.
        None when no rule matches, i.e. when an amount is negative or NaN.
    """
    one_off = data['ONEOFF_PURCHASES']
    installments = data['INSTALLMENTS_PURCHASES']

    # Logical `and` rather than bitwise `&`: these are scalar comparisons,
    # and `and` short-circuits without relying on operator precedence.
    if one_off == 0 and installments == 0:
        return 'No Purchase'
    if one_off > 0 and installments == 0:
        return 'One Off Purchase'
    if one_off == 0 and installments > 0:
        # The trailing space is part of the existing label; kept so that
        # previously produced values still match.
        return 'Installment '
    if one_off > 0 and installments > 0:
        return 'Both'
    # Negative or NaN amounts satisfy none of the rules above.
    return None
# Label every customer row with its purchase type; the distribution of these
# labels shows the different categories of customers.
data['Purchase_Type'] = data.apply(purchase_type, axis='columns')
data['Purchase_Type'].value_counts()

# Bar chart of how many customers fall into each purchase type.
sns.countplot(x='Purchase_Type', data=data).set_title(
    'Count of distinct purchase types'
)
plt.show()
# Limit usage: balance expressed as a fraction of the credit limit.
data['Limit Usage'] = data['BALANCE'].div(data['CREDIT_LIMIT'])
data['Limit Usage']

# Limit usage (as a percentage) against the credit limit.
sns.lineplot(x='CREDIT_LIMIT', y=data['Limit Usage'].mul(100), data=data)
plt.ylabel('Limit Usage in %')
plt.ylim(bottom=0, top=200)
plt.show()

# Number of customers whose balance exceeds their credit limit.
int(data['Limit Usage'].gt(1).sum())

# Ratio of payments made to the minimum payments.
data['Pay_To_Min_Pay_Ratio'] = data['PAYMENTS'].div(data['MINIMUM_PAYMENTS'])
data['Pay_To_Min_Pay_Ratio']
data['Purchase_Type'].describe()
def purchase_class(data):
    """Bucket a record's purchase frequency into a percentage band.

    Args:
        data: Mapping-like record (for example a DataFrame row) exposing a
            'PURCHASES_FREQUENCY' entry, which is multiplied by 100 before
            being compared with the band limits.

    Returns:
        '0 to 24', '25 to 49', '50 to 74' or '75 to 100'.
    """
    frequency_pct = data['PURCHASES_FREQUENCY'] * 100
    # Bands are checked in ascending order; the first upper limit that the
    # percentage stays below decides the label.
    bands = ((25, '0 to 24'), (50, '25 to 49'), (75, '50 to 74'))
    for upper_limit, label in bands:
        if frequency_pct < upper_limit:
            return label
    return '75 to 100'
# Assign every customer row to a purchase-frequency band.
data['PURCHASES_FREQUENCY_CLASS'] = data.apply(purchase_class, axis='columns')
data['PURCHASES_FREQUENCY_CLASS'].value_counts()

# Plot the purchase-frequency class KPI.
sns.countplot(x='PURCHASES_FREQUENCY_CLASS', data=data).set_title(
    'Count of different purchases frequency percentages:'
)
plt.show()
data.describe()
# Drop the customer id column.
data.drop(columns=['CUST_ID'], inplace=True)
data.columns

# Collect the numeric columns. select_dtypes covers every numeric width,
# not only the exact 'int64' / 'float64' dtypes.
cont_features = data.select_dtypes(include='number').columns.tolist()
cont_features

# Standardize all numeric columns. StandardScaler scales each column
# independently, so one fit over all of them replaces refitting the scaler
# once per column.
sc = StandardScaler()
data[cont_features] = sc.fit_transform(data[cont_features])
data.head()
# Encode the categorical variables as integer codes.
# Every non-numeric column is selected, so the selection does not depend on
# the columns reporting the exact dtype name 'object'.
cat_features = data.select_dtypes(exclude='number').columns.tolist()
cat_features

label_encoder = LabelEncoder()
for column in cat_features:
    # LabelEncoder expects a 1-D array of labels; passing the column itself
    # avoids the column-vector conversion warning raised for an (n, 1) input.
    data[column] = label_encoder.fit_transform(data[column])
data[cat_features].nunique()

# Correlation heatmap over all (now numeric) columns.
sns.heatmap(data.corr())
plt.show()
# Covariance matrix of the features (np.cov treats each row of data.T as one variable).
covariance_matrix = np.cov(data.T)
covariance_matrix[0:3]

# Eigen-decomposition of the covariance matrix.
# A covariance matrix is symmetric, so eigh is the appropriate routine: it
# returns real eigenvalues. Neither eig nor the slicing below guarantees that
# the largest eigenvalues come first, so the eigenpairs are sorted explicitly
# in descending order of eigenvalue.
eigen_values, eigen_vectors = np.linalg.eigh(covariance_matrix)
descending_order = np.argsort(eigen_values)[::-1]
eigen_values = eigen_values[descending_order]
eigen_vectors = eigen_vectors[:, descending_order]
eigen_vectors[0:3]

# SVD of the transposed data, kept for comparison with the eigenvectors.
# full_matrices=False avoids building an (n_samples x n_samples) matrix.
eigen_vec_svd, s, v = np.linalg.svd(data.T, full_matrices=False)
eigen_vec_svd[0:3]

for val in eigen_values:
    print(val)

# Percentage of the total variance carried by each component.
total_variance = eigen_values.sum()
var_explained = [(value / total_variance) * 100 for value in eigen_values]
var_explained

# Cumulative variance against the number of components kept; the x axis
# starts at 1 because the first cumulative value already uses one component.
sns.lineplot(
    x=list(range(1, len(var_explained) + 1)),
    y=np.cumsum(var_explained),
)
plt.xlabel("Number of components used")
plt.ylabel("% Variance retrieved")
plt.show()

# Project the data onto the six leading eigenvectors.
projection_matrix = eigen_vectors[:, 0:6]
projection_matrix
data_pca = data.dot(projection_matrix)
data_pca  # Final data after dimensionality reduction
data[cat_features].nunique()
# Elbow method: fit KMeans for k = 1..20 and record the inertia of each fit.
# The initialisation settings and seed are pinned to the same values as the
# final model so the curve is reproducible from run to run.
list_k = list(range(1, 21))
squared_dist = [
    KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42)
    .fit(data_pca)
    .inertia_
    for k in list_k
]

# Plot the sum of squared distances against k.
plt.figure(figsize=(8, 6))
plt.plot(list_k, squared_dist, '-o')
plt.xlabel('Number of clusters * k *')
plt.ylabel('Sum of squared distance')
plt.show()
# Final model: four clusters fitted on the reduced data.
model = KMeans(n_clusters=4, init='k-means++', max_iter=300, n_init=10, random_state=42)
model.fit(data_pca)
y_kmeans = model.predict(data_pca)
X = data_pca

# Scatter the clusters over the first two columns of the reduced data.
# Previously the same full frame was passed as both x and y, which put every
# point on the diagonal instead of showing any cluster structure.
components = np.asarray(X)
cluster_colors = ('red', 'blue', 'green', 'yellow')
for cluster_id, color in enumerate(cluster_colors):
    members = y_kmeans == cluster_id
    plt.scatter(
        components[members, 0],
        components[members, 1],
        s=100,
        c=color,
        label='Cluster {}'.format(cluster_id + 1),
    )
plt.xlabel('Component 1')
plt.ylabel('Component 2')
# The labels passed to scatter are only shown once a legend is drawn.
plt.legend()
plt.show()

# Number of customers in each cluster.
pd.Series(model.labels_).value_counts()
# Attach the cluster label to every row. The labels are in row order, so they
# are assigned positionally rather than aligned on the index.
cluster_data = data.assign(Cluster=model.labels_)

# Mean of every feature per cluster: features as rows, clusters as columns.
cluster_data_group = cluster_data.groupby('Cluster')[list(data.columns)].mean().T
cluster_data_group

fig, ax = plt.subplots(figsize=(15, 10))
index = np.arange(len(cluster_data_group.columns))

# The features were standardized earlier, so these cluster means are centred
# around zero and can be negative. They are plotted as-is: taking their log
# would give NaN for every non-positive mean and drop those bars.
cash_advance = cluster_data_group.loc['Monthly_cash_advance', :].values
credit_score = cluster_data_group.loc['Limit Usage', :].values
purchase = cluster_data_group.loc['Monthly_avg_purchase', :].values
installment = cluster_data_group.loc['INSTALLMENTS_PURCHASES', :].values
one_off = cluster_data_group.loc['ONEOFF_PURCHASES_FREQUENCY', :].values

bar_width = .10
b1 = plt.bar(index, cash_advance, color='r', label='Monthly cash advance', width=bar_width)
b2 = plt.bar(index + bar_width, credit_score, color='black', label='Credit_score', width=bar_width)
b3 = plt.bar(index + 2 * bar_width, purchase, color='orange', label='Avg purchase', width=bar_width)
b4 = plt.bar(index + 3 * bar_width, installment, color='b', label='installment', width=bar_width)
b5 = plt.bar(index + 4 * bar_width, one_off, color='g', label='One_off purchase', width=bar_width)

plt.xlabel("Clusters")
plt.ylabel("Mean standardized value")
plt.title("Key Insights")
# One tick per cluster, centred under its group of five bars; the labels are
# derived from the clusters present instead of a hard-coded list of four.
plt.xticks(
    index + 2 * bar_width,
    ['Cluster{}'.format(cluster_id + 1) for cluster_id in cluster_data_group.columns],
)
plt.legend()
plt.show()